library(plyr);library(dplyr,warn.conflicts = FALSE);library(tidyr)
library(ggplot2);library(lubridate,warn.conflicts = FALSE)
suppressMessages(library(ggmap))
library(ggrepel);library(RColorBrewer) 
suppressMessages(library(geosphere))
suppressMessages(library(multiwayvcov, warn.conflicts = FALSE));suppressMessages(library(lmtest, warn.conflicts = FALSE))
library(nnet)
suppressMessages(library(stargazer))
library(ineq)
library(stringr)

rm(list=ls()) #clear workspace
#setwd('') #set working directory

#convenience functions
s = function(x){summary(factor(x))} #counts of each value of a variable
Num = function(x){as.numeric(as.character(x))} #convert to number
S = function(x){round(sort(s(x)/nrow(B) * 100, decreasing = T))}

A=read.csv('../data/9-21-18_deid_Poln_trimmed.csv',header=T,na.strings=c('','NA'), stringsAsFactors = F, sep = ';') #9319

A = A[-which(A$NeighN < 20),] #Cut people from neighborhoods with fewer than 20 respondents -> 9247
A = A[which(!is.na(A$A.A7_Area.Neighborhood)),] #Cut respondents who are not assigned to a neighborhood. -> 9026
###########################################################################

###########################################################################
#cleaning and recoding

A$Male = as.numeric(A$C.C5_Gender == 2)
A$Female = A$C.C5_Gender == 2
A$Age = as.numeric(A$C.C4_Age)

A$Religion = mapvalues(A$C.C6_Religion, from = c('Don\'t know', 'Other', 'Refuse to answer'),
                       to = c(NA,NA,NA))
A$Muslim = A$Religion == 'Muslim'
A$Hindu = as.numeric(A$C.C6_Religion == 'Hindu')

A$HiCaste = A$CasteCategory == 'General/BC/OBC'
A$GenCaste = as.numeric(A$C.C8_Caste == 1)
A$C.C8_Caste = factor(mapvalues(A$C.C8_Caste, from = '888', to = NA))

A$C.C10_Mother.Tongue. = factor(mapvalues(A$C.C10_Mother.Tongue., from = c('888','999'), to = c(NA,NA) ) )
A$C.C50_Years.Of.Schooling.[which(A$C.C50_Years.Of.Schooling. > 22)] = NA

A$BJP = A$L.L72_Party.performance == 1
A$L.L72_Party.performance[which(! A$L.L72_Party.performance %in% c(1,2,3,4))] = NA

A$HtH = NA
A$HtH[which(A$J.J3_Source.of.water == 4 & A$Wave != 'Bangalore 2017')] = 1 #two types hth for b17
A$HtH[which(A$J.J3_Source.of.water == 3 & A$Wave == 'Bangalore 2017')] = 1 #two types hth for b17
A$HtH[which(A$J.J3_Source.of.water == 4 & A$Wave == 'Bangalore 2017')] = 1
A$HtH[which(is.na(A$HtH))] = 0#set NA's to 0
A$HtH[which(is.na(A$J.J3_Source.of.water))] = NA #

A$Tanker = A$J.J3_Source.of.water == 1
A$NoToilet = A$J.J4_Type.of.Toilet == 4

A$El = A$J.J5_Electricity == 1
A$Voter = A$L.L8_Voter.ID == 1
A$Ration = A$L.L9_Ration.card == 1
A$ID = A$L.L10_Unique.Id == 1

A$Patta = mapvalues(A$J.J17_Patta, 
                    from = c(1,2,3,4,
                             5,
                             888,999), 
                    to = c(1,1,1,1,
                           0,
                           NA,NA))

A$Evict = A$J.J23_b > 0
A$J.J24_Eviction = mapvalues(A$J.J24_Eviction, from = c(888, 999), to = c(NA, NA))
A$Recog = mapvalues(A$J.J22, from = c(888, 999, -888), to = c(0,0,0)) #code don't knows as 0

A$L.L29_Help.each.other[which(A$L.L29_Help.each.other > 4)] = NA
A$L.L32_Serious.Disagreement[which(A$L.L32_Serious.Disagreement  > 4)] = NA
A$L.L35_Typically.resolved[which(A$L.L35_Typically.resolved > 4)] = NA
A$L.L38_Community.meeting[which(A$L.L38_Community.meeting > 2)]  = NA

A$PrimSch = A$L.L20_Primary.School %>% mapvalues(from = c(1,2,3,4,5,777,888,999),
                                                 to = c(5,4,3,2,1,NA,NA,NA))
A$ElSat = A$L.L25_Electricity %>% mapvalues(from = c(1,2,3,4,5,777,888,999),
                                            to = c(5,4,3,2,1,NA,NA,NA))
A$PolSat = A$L.L26_Local.Policing %>% mapvalues(from = c(1,2,3,4,5,777,888,999),
                                                to = c(5,4,3,2,1,NA,NA,NA))
A$LeaderHelp = A$L.L54_Leader.help %>% mapvalues(from = c(-999, -888, 777, 888, 999),
                                                 to = c(NA, NA, NA, NA, NA))
A$LeaderHelpTruncated = A$LeaderHelp
A$LeaderHelpTruncated[which(A$LeaderHelpTruncated > 5)] = 5
############################################################################################################

############################################################################################################
#PCA
soc_vars_3 = c('L.L28_Community.work','L.L32_Serious.Disagreement','L.L38_Community.meeting')
set.seed(1000)
ind_pca_3 = prcomp( ~ L.L28_Community.work+L.L32_Serious.Disagreement+
                      L.L38_Community.meeting, data = A, scale = T)

#RESULTS FOR TABLE B2
summary(ind_pca_3) 

x_temp = c(1,2,3)
#FIGURE B2
plot(x_temp, ind_pca_3$sdev, pch = 16, main = 'PCA Scree Plot',
     xlab = 'PCA Component', ylab = 'Standard Deviation')
#saved via RStudio dialog; 600x600

PropVar = (ind_pca_3[[1]]^2)/sum(ind_pca_3[[1]]^2)
SocDens2 = - ( ind_pca_3$x[,1]*PropVar[1] - ind_pca_3$x[,2]*PropVar[2] ) / ( PropVar[1] + PropVar[2] )  #is there a principled reason for SUBTRACTING?

#add to main data frame
A[which(complete.cases(A[,soc_vars_3])),'SocDens'] = -ind_pca_3$x[,1]
A[which(complete.cases(A[,soc_vars_3])),'SocDens2'] = SocDens2; rm(SocDens2)

rm(PropVar, soc_vars_3, ind_pca_3)
###############################################################################################################

###############################################################################################################
####Clean and create variables for list experiment ####################################
#List experiment not used, but we drop observations where missing, so need to include this
A$L.SE2_1.L74[which(A$L.SE2_1.L74 > 3)] = NA
A$L.SE2_2.L75[which(A$L.SE2_2.L75 > 4)] = NA
A$L.SE2_3.L76[which(A$L.SE2_3.L76 > 4)] = NA
#drop respondents who have NA for all
A = A[-which(is.na(A[,'L.SE2_1.L74']) & is.na(A[,'L.SE2_2.L75']) & is.na(A[,'L.SE2_3.L76']) ), ] #9206 drops to 9184
##########################################################################################

####################################################################################################
#create variable for slum-level density
slum_dens = A %>% 
  group_by(A.A7_Area.Neighborhood) %>%
  summarise(N = n(),
            Mean_Age = mean(Num(C.C4_Age), na.rm=T),
            SocDens_Mean = mean(SocDens, na.rm=T),
            SocDens_SD = sd(SocDens, na.rm=T),
            City = City[which.max(s(City))],
            Wave = Wave[which.max(s(Wave))]) %>%
  mutate(SocDens_Hi = SocDens_Mean > median(SocDens_Mean))

slum_dens = slum_dens  %>% arrange(City, Wave) %>% mutate(Neigh = seq(1:nrow(slum_dens)) ) %>% data.frame()

A$NeighDens = slum_dens[match(A$A.A7_Area.Neighborhood, slum_dens$A.A7_Area.Neighborhood),'SocDens_Mean']
A$NeighDens2 = slum_dens[match(A$A.A7_Area.Neighborhood, slum_dens$A.A7_Area.Neighborhood),'SocDens2_Mean']
A$NeighDens_Hi = as.numeric(slum_dens[match(A$A.A7_Area.Neighborhood, slum_dens$A.A7_Area.Neighborhood),'SocDens_Hi'])
rm(slum_dens)
########################################################################################################################

########################################################################################################################
Neigh = A %>% group_by(A.A7_Area.Neighborhood) %>% summarise(NeighDens = mean(NeighDens),
                                                             PropMost = mean( NeighPropMost),
                                                             Fractionalization = mean(NeighFractionalization),
                                                             PartyPropMost = sort(table(L.L72_Party.performance,useNA = 'no'),decreasing=T)[1] / n(),
                                                             PartyFrac = 1-sum((table(L.L72_Party.performance)/n())^2),
                                                             NeighVB = mean(L.L67_Effective.vote.bank),
                                                             AssetSum = mean(AssetSum),
                                                             Recog = mean(Recog),
                                                             Evict = mean(Evict, na.rm=T),
                                                             Secure = mean(J.J24_Eviction, na.rm=T),
                                                             NoToilet = mean(NoToilet),
                                                             MostNeed = names(sort(table(L.L19_Public.needs), decreasing = T))[1],
                                                             SecMostNeed = names(sort(table(L.L19_Public.needs), decreasing = T))[2],
                                                             PropMostNeed = sort(table(L.L19_Public.needs), decreasing = T)[1] / n(),
                                                             PropSecMostNeed = sort(table(L.L19_Public.needs), decreasing = T)[2] / n(),
                                                             City = City[1],
                                                             AvgYears = mean(YearsInCity, na.rm = T),    
                                                             PropUnder20 = mean(YearsInCity <= 20, na.rm = T),
                                                             PropUnder10 = mean(YearsInCity <= 10, na.rm = T),
                                                             PropBorn = mean(C.C14_Permanent.Residence.of.Jaipur.),
                                                             El = mean(El, na.rm = T),
                                                             Voter = mean(Voter, na.rm = T),
                                                             Ration = mean(Ration, na.rm = T),
                                                             ID = mean(ID, na.rm = T),
                                                             PrimSch = mean(PrimSch, na.rm = T),
                                                             ElSat = mean(ElSat, na.rm = T),
                                                             PolSat = mean(PolSat, na.rm = T),
                                                             LeaderHelp = mean(LeaderHelp, na.rm = T),
                                                             LeaderHelpTruncated = mean(LeaderHelpTruncated, na.rm = T),
                                                             Tanker = mean(Tanker, na.rm = T),
                                                             NoToilet = mean(NoToilet, na.rm = T),
                                                             Wave = names(sort(table(Wave), decreasing = T))[1],
                                                             PropMuslim = sum(C.C6_Religion == 'Muslim', na.rm = T) / sum(!is.na(C.C6_Religion)),
                                                             PropChristian = sum(C.C6_Religion == 'Christian', na.rm = T) / sum(!is.na(C.C6_Religion)),
                                                             PropSC = sum(CasteCategory == 'SC/ST/Other', na.rm = T) / sum(!is.na(CasteCategory))
                                                             
) %>% data.frame()
#############################################################################

##########################################################################################
#Table H1
lm0 = (lm('Fractionalization ~ NeighDens', data = Neigh[which(Neigh$City == 'Bangalore'),])) 
lm1 = (lm('Fractionalization ~ NeighDens + AssetSum', data = Neigh[which(Neigh$City == 'Bangalore'),]))  
out = stargazer(lm0, lm1,
                covariate.labels = c('Social Density','Asset Score','Constant'),
                dep.var.labels = c('Leadership Fractionalization'), header = F,
                add.lines = list(c('City','Bangalore','Bangalore')),
                title = 'Leadership fractionalization and social density',
                label = 'leaderfrac_dens',
                star.char = c(".", "*", "**", "***"),
                star.cutoffs = c(.1, .05, .01, .001))
writeLines(out,con = paste0('../results/H1_',(Sys.time() %>% str_replace_all(c(":" = "-", " " = "_"))),'.tex'))
rm(lm0, lm1, out)

# #What is magnitude of effect? 
#sd(Neigh$NeighDens[which(Neigh$City == 'Bangalore')]) #0.565935 #IV
#sd(Neigh$Fractionalization[which(Neigh$City == 'Bangalore')]) #0.1568606 #DV
# -0.110738 * (0.565935/ 0.1568606) # = -0.39953 #Coef * (IV-SE/DV-SE): changing the IV by this many SE's changes the DV by this many SE's

#Table H2
lm0 = (lm('PartyFrac ~ NeighDens', data = Neigh)) #- **
lm1 = (lm('PartyFrac ~ NeighDens + AssetSum + City', data = Neigh)) #negative, 1% !
out = stargazer(lm0, lm1,
                covariate.labels = c('Social Density','Asset Score','Jaipur Dummy','Patna Dummy','Constant'),
                dep.var.labels = c('Party Fractionalization'), header = F,
                add.lines = list(c('City','All','All')),
                title = 'Party fractionalization and social density',
                label = 'partyfrac_dens',
                star.char = c(".", "*", "**", "***"),
                star.cutoffs = c(.1, .05, .01, .001))
writeLines(out,con = paste0('../results/H2_',(Sys.time() %>% str_replace_all(c(":" = "-", " " = "_"))),'.tex')); rm(out)
rm(lm0, lm1)

# #What is magnitude of effect?  #sd(Neigh$NeighDens) #0.4708526 #IV #sd(Neigh$PartyFrac) #0.169855 #DV
# -0.102906 * (0.4708526/ 0.169855) # = -0.2852642 #Coef * (IV-SE/DV-SE): changing the IV by this many SE's changes the DV by this many SE's

#Table H3
lm0 = (lm('NeighVB ~ NeighDens', data = Neigh)) #positive, 1%
lm1 = (lm('NeighVB ~ NeighDens + AssetSum + City', data = Neigh)) #positive, 5%!
out = stargazer(lm0, lm1,
                covariate.labels = c('Social Density','Asset Score','Jaipur Dummy','Patna Dummy','Constant'),
                dep.var.labels = c('Proportion Claiming Vote Bank'), header = F,
                add.lines = list(c('City','All','All')),
                title = 'Vote banking and social density',
                label = 'votebank_dens',
                star.char = c(".", "*", "**", "***"),
                star.cutoffs = c(.1, .05, .01, .001))
writeLines(out,con = paste0('../results/H3_',(Sys.time() %>% str_replace_all(c(":" = "-", " " = "_"))),'.tex')); rm(out)
rm(lm0, lm1)

# #What is magnitude of effect? 
#sd(Neigh$NeighDens) #0.4708526 #IV #sd(Neigh$NeighVB) #0.1866523 #DV
# 0.28470 * (0.4708526/ 0.1866523) # = 0.7181896 wow #Coef * (IV-SE/DV-SE): changing the IV by this many SE's changes the DV by this many SE's
#how many percentage points? #Coef * IV-SE * 100 #13.4%!

#########
#Table 5
lm1 = (lm('Fractionalization ~ NeighDens + AssetSum', data = Neigh[which(Neigh$City == 'Bangalore'),])) #old t4
lm2 = (lm('PartyFrac ~ NeighDens + AssetSum + City', data = Neigh)) #old t5
lm3 = (lm('NeighVB ~ NeighDens + AssetSum + City', data = Neigh)) #old t6
out = stargazer(lm1, lm2, lm3,
                covariate.labels = c('Social Density','Asset Score','Jaipur Dummy','Patna Dummy','Constant'),
                dep.var.labels = c(""), #header = F,
                column.labels = c("Leader Frac.","Party Frac.","Vote Bank"), header = F,
                add.lines = list(c('City','Bangalore','All','All')),
                title = 'Leader fractionalization, Party fractionalization, Vote banking, and Social Density',
                no.space = F,
                column.sep.width = '0pt',
                omit.stat = c('f','ser'),
                label = 'Frac-PartyFrac-NeighVB_dens',
                star.char = c(".", "*", "**", "***"),
                star.cutoffs = c(.1, .05, .01, .001))
writeLines(out,con = paste0('../results/T5_',(Sys.time() %>% str_replace_all(c(":" = "-", " " = "_"))),'.tex')); rm(out)
rm(lm1, lm2, lm3)
##########################################################################################

##########################################################################################
Neigh$Toilet = 1 - Neigh$NoToilet
Neigh$NoTanker = 1 - Neigh$Tanker

#Table 6
lm1=(lm('El ~ NeighDens + AssetSum + City', data = Neigh)) 
lm2=(lm('PrimSch ~ NeighDens + AssetSum + City', data = Neigh)) 
lm3=(lm('NoTanker ~ NeighDens + AssetSum + City', data = Neigh)) 
lm4=(lm('Toilet ~ NeighDens + AssetSum + City', data = Neigh))
lm5=(lm('Recog ~ NeighDens + AssetSum + City', data = Neigh)) 
out = stargazer(lm1, lm2, lm3, lm4, lm5, 
                covariate.labels = c('Social Density','Asset Score','Jaipur Dummy','Patna Dummy','Constant'),
                dep.var.labels = c(""), 
                column.labels = c('Electr.','School','Water','Toilet','Recog.'), header = F,
                add.lines = list(c('City','All','All','All','All','All')),
                title = 'Neighborhood services and social density',
                omit.stat = c('f','ser'),
                label = 'Dens_Svcs',
                star.char = c(".", "*", "**", "***"),
                star.cutoffs = c(.1, .05, .01, .001))
writeLines(out,con = paste0('../results/T6_',(Sys.time() %>% str_replace_all(c(":" = "-", " " = "_"))),'.tex'))
rm(out, lm1, lm2, lm3, lm4, lm5)

#Table E3 (Appendix)
lm1 = lm('NeighDens ~ AssetSum', data = Neigh) 
lm2 = lm('AvgYears ~ AssetSum', data = Neigh) 
lm3 = lm('PartyFrac ~ AssetSum + AvgYears', data = Neigh) 
lm4 = lm('PropSC ~ AssetSum + City', data = Neigh) 
out = stargazer(lm1,
                covariate.labels = c('Asset Score'),
                dep.var.labels = c(""),
                column.labels = c('Social Density'),
                add.lines = list(c('City','All')),
                title = 'Social density and assets',
                omit.stat = c('f','ser'),
                label = 'Dens_Assets',
                star.char = c(".", "*", "**", "***"),
                star.cutoffs = c(.1, .05, .01, .001))
writeLines(out,con = paste0('../results/E3_',(Sys.time() %>% str_replace_all(c(":" = "-", " " = "_"))),'.tex'))
rm(out, lm1)

#Table E1
out = stargazer(lm2,
                covariate.labels = c('Asset Score'),
                dep.var.labels = c(""),
                column.labels = c('Average Tenure [years]'),
                add.lines = list(c('City','All')),
                title = 'Residential tenure and assets',
                omit.stat = c('f','ser'),
                label = 'Tenure_Assets',
                star.char = c(".", "*", "**", "***"),
                star.cutoffs = c(.1, .05, .01, .001))
writeLines(out,con = paste0('../results/E1_',(Sys.time() %>% str_replace_all(c(":" = "-", " " = "_"))),'.tex'))
rm(out, lm2)

#Table E2
out = stargazer(lm3,
                covariate.labels = c('Asset Score', 'Avg. Tenure [years]'),
                dep.var.labels = c(""),
                column.labels = c('Party Fractionalization'),
                add.lines = list(c('City','All')),
                title = 'Party fractionalization, assets, and tenure',
                omit.stat = c('f','ser'),
                label = 'Frac_Tenure',
                star.char = c(".", "*", "**", "***"),
                star.cutoffs = c(.1, .05, .01, .001))
writeLines(out,con = paste0('../results/E2_',(Sys.time() %>% str_replace_all(c(":" = "-", " " = "_"))),'.tex'))
rm(out, lm3)

#Table E4
out = stargazer(lm4, 
                covariate.labels = c('Asset Score', 'Jaipur Dummy','Patna Dummy'),
                dep.var.labels = c(""),
                column.labels = c('Proportion Sch. Caste and Sch. Tribes'),
                add.lines = list(c('City','All')),
                title = 'Caste and assets',
                omit.stat = c('f','ser'),
                label = 'Caste_Assets',
                star.char = c(".", "*", "**", "***"),
                star.cutoffs = c(.1, .05, .01, .001))
writeLines(out,con = paste0('../results/E4_',(Sys.time() %>% str_replace_all(c(":" = "-", " " = "_"))),'.tex'))
rm(out, lm4)

#Table E6
lm1 = lm('PartyFrac ~ Fractionalization + City', data = Neigh)
out = stargazer(lm1, 
                covariate.labels = c('Leader Frac.', 'Jaipur Dummy','Patna Dummy'),
                dep.var.labels = c(""),
                column.labels = c('Party Fractionalization'),
                add.lines = list(c('City','All')),
                title = 'Leadership Fractionalization and Party Fractionalization',
                omit.stat = c('f','ser'),
                label = 'Leader_Party_Frac',
                star.char = c(".", "*", "**", "***"),
                star.cutoffs = c(.1, .05, .01, .001))
writeLines(out,con = paste0('../results/E6_',(Sys.time() %>% str_replace_all(c(":" = "-", " " = "_"))),'.tex'))
rm(out, lm1)

#Table E5
lm1=(lm('El ~ NeighDens + AssetSum + City + PropSC + PropMuslim', data = Neigh)) 
lm2=(lm('PrimSch ~ NeighDens + AssetSum + City + PropSC + PropMuslim', data = Neigh)) 
lm3=(lm('NoTanker ~ NeighDens + AssetSum + City + PropSC + PropMuslim', data = Neigh)) 
lm4=(lm('Toilet ~ NeighDens + AssetSum + City + PropSC + PropMuslim', data = Neigh))
lm5=(lm('Recog ~ NeighDens + AssetSum + City + PropSC + PropMuslim', data = Neigh)) 
out = stargazer(lm1, lm2, lm3, lm4, lm5, 
                covariate.labels = c('Social Density','Asset Score','Jaipur Dummy','Patna Dummy','Low Caste','Muslim','Constant'),
                #dep.var.labels = c('Elec.','School','Water','Toilet','Recog.'), header = F,
                dep.var.labels = c(""), #header = F,
                column.labels = c('Electr.','School','Water','Toilet','Recog.'), header = F,
                add.lines = list(c('City','All','All','All','All','All')),
                title = 'Neighborhood services and social density',
                #no.space = T,
                omit.stat = c('f','ser'),
                label = 'Dens_Svcs_CasteMuslim',
                star.char = c(".", "*", "**", "***"),
                star.cutoffs = c(.1, .05, .01, .001))
writeLines(out,con = paste0('../results/E5_',(Sys.time() %>% str_replace_all(c(":" = "-", " " = "_"))),'.tex'))
rm(out, lm1, lm2, lm3, lm4, lm5)

# #What is magnitude of effect? 
# sd(Neigh$NeighDens) #0.4708526 #IV
# sd(Neigh$El) #0.2902129 #DV
# sd(Neigh$PrimSch) #0.5076656 #DV
# sd(Neigh$NoTanker) #0.1565287 #DV
# sd(Neigh$Toilet) #0.2068612 #DV
# sd(Neigh$Recog) #0.2750397 #DV
# 0.102 * (0.4708526/ 0.2902129) # = 0.1654887  #Coef * (IV-SE/DV-SE): changing the IV by this many SE's changes the DV by this many SE's
# 0.367 * (0.4708526/ 0.5076656) # = 0.3403873  #Coef * (IV-SE/DV-SE): changing the IV by this many SE's changes the DV by this many SE's
# 0.064 * (0.4708526/ 0.1565287) # = 0.1925178  #Coef * (IV-SE/DV-SE): changing the IV by this many SE's changes the DV by this many SE's
# 0.079 * (0.4708526/ 0.2068612) # = 0.1798179  #Coef * (IV-SE/DV-SE): changing the IV by this many SE's changes the DV by this many SE's
# 0.184 * (0.4708526/ 0.2750397) # = 0.3149977  #Coef * (IV-SE/DV-SE): changing the IV by this many SE's changes the DV by this many SE's
###################################################################################################

##########################################################################################
#T1
s(A$Wave)
s(A$City)
s(Neigh$Wave)
s(Neigh$City)
##########################################################################################